In [1]:
%load_ext nb_black
In [95]:
import pandas as pd
import sys
from sklearn.linear_model import LogisticRegression, LinearRegression
from collections import Counter, defaultdict
import numpy as np
import plotly.express as px
import warnings
from scipy.stats import spearmanr, kendalltau
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, vstack
from scipy.special import logit, expit

Подготовим данные и удалим лишнее¶

In [3]:
results_data = pd.read_pickle("data/results.pkl")
display(sys.getsizeof(results_data) / 1024)

results_data = {
    k: v
    for k, v in results_data.items()
    if all([t.get("mask", None) is not None for t in results_data[k]])
}
display(sys.getsizeof(results_data) / 1024)
288.0859375
144.09375
In [4]:
players_df = pd.DataFrame.from_dict(pd.read_pickle("data/players.pkl"), orient="index")
players_df.index = players_df.id
players_df.drop(["id"], axis=1, inplace=True)

tournaments_df = pd.DataFrame.from_dict(
    pd.read_pickle("data/tournaments.pkl"), orient="index"
)
tournaments_df.index = tournaments_df.id
tournaments_df.drop(["id"], axis=1, inplace=True)
tournaments_df.dateStart = pd.to_datetime(tournaments_df.dateStart, utc=True)
tournaments_df.dateEnd = pd.to_datetime(tournaments_df.dateEnd, utc=True)
tournaments_df = tournaments_df.loc[[k for k, v in results_data.items() if len(v)]]
In [5]:
year_start_train, year_start_test, year_stop_test = ("2019", "2020", "2021")
tournaments_df_train = tournaments_df[
    (tournaments_df.dateStart >= year_start_train)
    & (tournaments_df.dateStart < year_start_test)
]
tournaments_df_test = tournaments_df[
    (tournaments_df.dateStart >= year_start_test)
    & (tournaments_df.dateStart < year_stop_test)
]
tournaments_df_train.shape, tournaments_df_test.shape
Out[5]:
((673, 8), (167, 8))
In [6]:
max_mask_len = max(len(t["mask"]) for r in results_data.values() for t in r)
max_mask_len
Out[6]:
500
In [7]:
results_data = {
    i: results_data[i]
    for i in tournaments_df[
        (tournaments_df.dateStart >= year_start_train)
        & (tournaments_df.dateStart < year_stop_test)
    ].index
}
display(sys.getsizeof(results_data) / 1024)
36.09375
In [8]:
assert len(results_data) == tournaments_df_train.shape[0] + tournaments_df_test.shape[0]
assert all(t.year == 2019 for t in tournaments_df_train.dateStart)
assert all(t.year == 2020 for t in tournaments_df_test.dateStart)
In [9]:
display(tournaments_df_train, players_df, next(iter(results_data.values()))[0])
name dateStart dateEnd type season orgcommittee synchData questionQty
id
4772 Синхрон северных стран. Зимний выпуск 2019-01-05 16:00:00+00:00 2019-01-09 16:00:00+00:00 {'id': 3, 'name': 'Синхрон'} /seasons/52 [{'id': 28379, 'name': 'Константин', 'patronym... {'dateRequestsAllowedTo': '2019-01-09T23:59:59... {'1': 12, '2': 12, '3': 12}
4973 Балтийский Берег. 3 игра 2019-01-25 16:05:00+00:00 2019-01-29 16:00:00+00:00 {'id': 3, 'name': 'Синхрон'} /seasons/52 [{'id': 23030, 'name': 'Марина', 'patronymic':... {'dateRequestsAllowedTo': '2019-01-28T23:59:59... {'1': 12, '2': 12, '3': 12}
4974 Балтийский Берег. 4 игра 2019-03-01 16:05:00+00:00 2019-03-05 16:00:00+00:00 {'id': 3, 'name': 'Синхрон'} /seasons/52 [{'id': 23030, 'name': 'Марина', 'patronymic':... {'dateRequestsAllowedTo': '2019-03-04T23:59:59... {'1': 12, '2': 12, '3': 12}
4975 Балтийский Берег. 5 игра 2019-04-05 16:05:00+00:00 2019-04-09 16:00:00+00:00 {'id': 3, 'name': 'Синхрон'} /seasons/52 [{'id': 23030, 'name': 'Марина', 'patronymic':... {'dateRequestsAllowedTo': '2019-04-08T23:59:59... {'1': 12, '2': 12, '3': 12}
4986 ОВСЧ. 6 этап 2019-02-15 17:00:00+00:00 2019-02-19 17:00:00+00:00 {'id': 3, 'name': 'Синхрон'} /seasons/52 [{'id': 59140, 'name': 'Борис', 'patronymic': ... {'dateRequestsAllowedTo': '2019-02-19T23:59:59... {'1': 12, '2': 12, '3': 12}
... ... ... ... ... ... ... ... ...
6173 Кубок Мэра Казани 2019-12-15 07:00:00+00:00 2019-12-15 12:00:00+00:00 {'id': 2, 'name': 'Обычный'} /seasons/53 [{'id': 33624, 'name': 'Ренат', 'patronymic': ... None {'1': 15, '2': 15, '3': 15, '4': 15}
6191 Всеармянский Интеллектуальный Фестиваль 2019-12-22 09:00:00+00:00 2019-12-22 13:00:00+00:00 {'id': 2, 'name': 'Обычный'} /seasons/53 [{'id': 19981, 'name': 'Сейран', 'patronymic':... None {'1': 12, '2': 12, '3': 12}
6249 Школьный синхрон-lite. Сезон 3 2019-08-31 21:05:00+00:00 2020-04-30 20:55:00+00:00 {'id': 5, 'name': 'Общий зачёт'} /seasons/53 [{'id': 23740, 'name': 'Владимир', 'patronymic... None {'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, ...
6254 Школьная лига 2019-10-04 16:00:00+00:00 2020-03-22 16:00:00+00:00 {'id': 5, 'name': 'Общий зачёт'} /seasons/53 [{'id': 39218, 'name': 'Владислав', 'patronymi... None {'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, ...
6255 ОВСЧ 2019-09-20 17:00:00+00:00 2020-02-19 20:59:00+00:00 {'id': 5, 'name': 'Общий зачёт'} /seasons/53 [{'id': 32901, 'name': 'Наиль', 'patronymic': ... None {'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, ...

673 rows × 8 columns

name patronymic surname
id
1 Алексей None Абабилов
10 Игорь Абалов
11 Наталья Юрьевна Абалымова
12 Артур Евгеньевич Абальян
13 Эрик Евгеньевич Абальян
... ... ... ...
224700 Артём Евгеньевич Садов
224701 Даниил Олегович Трефилов
224702 Владимир Араратович Басенцян
224703 Руслан Ринатович Дауранов
224704 Александр Викторович Гапонов

204063 rows × 3 columns

{'team': {'id': 45556,
  'name': 'Рабочее название',
  'town': {'id': 285, 'name': 'Санкт-Петербург'}},
 'mask': '111111111011111110111111111100010010',
 'current': {'name': 'Рабочее название',
  'town': {'id': 285, 'name': 'Санкт-Петербург'}},
 'questionsTotal': 28,
 'synchRequest': {'id': 56392,
  'venue': {'id': 3030, 'name': 'Санкт-Петербург'}},
 'position': 1,
 'controversials': [{'id': 91169,
   'questionNumber': 15,
   'answer': 'Мьёльнир',
   'issuedAt': '2019-01-06T13:28:48+03:00',
   'status': 'A',
   'comment': '',
   'resolvedAt': '2019-01-06T15:25:54+03:00',
   'appealJuryComment': None}],
 'flags': [],
 'teamMembers': [{'flag': 'Б',
   'usedRating': 13507,
   'rating': 13507,
   'player': {'id': 6212,
    'name': 'Юрий',
    'patronymic': 'Яковлевич',
    'surname': 'Выменец'}},
  {'flag': 'Б',
   'usedRating': 10988,
   'rating': 13185,
   'player': {'id': 18332,
    'name': 'Александр',
    'patronymic': 'Витальевич',
    'surname': 'Либер'}},
  {'flag': 'Б',
   'usedRating': 8534,
   'rating': 12801,
   'player': {'id': 18036,
    'name': 'Михаил',
    'patronymic': 'Ильич',
    'surname': 'Левандовский'}},
  {'flag': 'К',
   'usedRating': 6401,
   'rating': 12801,
   'player': {'id': 22799,
    'name': 'Сергей',
    'patronymic': 'Игоревич',
    'surname': 'Николенко'}},
  {'flag': 'Б',
   'usedRating': 4252,
   'rating': 12757,
   'player': {'id': 15456,
    'name': 'Сергей',
    'patronymic': 'Владимирович',
    'surname': 'Коновалов'}},
  {'flag': 'Б',
   'usedRating': 2069,
   'rating': 12416,
   'player': {'id': 26089,
    'name': 'Ирина',
    'patronymic': 'Сергеевна',
    'surname': 'Прокофьева'}}]}
In [10]:
players_games: dict[int, list[list[tuple[int, int]]]] = defaultdict(lambda: [])
used_cnt = 0
skiped_cnt = 0
for t_id in tournaments_df_train.index:
    data = results_data[t_id]
    answers_cnt = Counter([len(t["mask"]) for t in data])
    if len(answers_cnt) != 1:
        skiped_cnt += answers_cnt.total()
        continue
    used_cnt += answers_cnt.total()
    results = np.zeros((len(data), answers_cnt.most_common(1)[0][0]), dtype=bool)

    for idx, team in enumerate(data):
        mask = list(map(int, team["mask"].replace("?", "1").replace("X", "0")))
        results[idx] = np.array(mask, dtype=bool)
    questions_power = 1 - results.mean(axis=0)

    for idx, team in enumerate(data):
        mask = list(map(int, team["mask"].replace("?", "1").replace("X", "0")))
        results[idx] = np.array(mask, dtype=bool)
        for u in team["teamMembers"]:
            players_games[u["player"]["id"]].append(
                list(zip(mask, list(questions_power)))
            )
f"пропущено {skiped_cnt / used_cnt * 100:.0f}% данных из-за различной длины вопросов внутри соревнования"
Out[10]:
'пропущено 11% данных из-за различной длины вопросов внутри соревнования'

Смотрим статистики игроков на основе команд, в которых они играли

In [11]:
# player = players_games[22799]
def plot_answers_for_player(player: list[list[tuple[int, int]]]):
    display(
        px.scatter(
            x=[q[1] for g in player for q in g],
            y=[q[0] for g in player for q in g],
            title="Игрок",
        ).update_layout(
            xaxis_title="Сложность вопроса",
            yaxis_title="Правильный/неправильный ответ",
        )
    )
    display(
        px.histogram(
            x=[q[1] for g in player for q in g if q[0] == 1],
            nbins=20,
            title="распределение правильных ответов",
        ).update_layout(
            xaxis_title="Сложность вопроса",
            yaxis_title="количество правильных ответов",
        )
    )
    display(
        px.histogram(
            x=[q[1] for g in player for q in g if q[0] == 0],
            nbins=20,
            title="распределение неправильных ответов",
        ).update_layout(
            xaxis_title="Сложность вопроса",
            yaxis_title="количество неправильных ответов",
        )
    )


plot_answers_for_player(players_games[22799])
plot_answers_for_player(players_games[87797])
plot_answers_for_player(players_games[87509])
In [12]:
def player_stats(player):

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        corr = np.mean([q[1] for g in player for q in g if q[0] == 1])
        incorr = np.mean([q[1] for g in player for q in g if q[0] == 0])
    if np.isnan(corr):
        corr = 0
    if np.isnan(incorr):
        incorr = 0

    return corr, incorr


(
    player_stats(players_games[22799]),
    player_stats(players_games[87797]),
    player_stats(players_games[87509]),
)
Out[12]:
((0.4975770084453995, 0.8328454403839074),
 (0.43555547069053063, 0.7163843381518152),
 (0.42532894807533034, 0.7055017639185375))
In [13]:
features = 40


def features_for_player(player):
    return np.hstack(
        (
            np.histogram(
                [q[1] for g in player for q in g if q[0] == 1],
                bins=features // 2,
                range=(0, 1),
                density=False,
            )[0],
            np.histogram(
                [q[1] for g in player for q in g if q[0] == 0],
                bins=features // 2,
                range=(0, 1),
                density=False,
            )[0],
        )
    )


display(px.line(features_for_player(players_games[22799])))
display(px.line(features_for_player(players_games[87797])))
display(px.line(features_for_player(players_games[87509])))

Статистики команд на основе статистик игроков, которые входят в эту команду

In [14]:
def team_features(players):
    team = np.empty((len(players), features))
    for idx, player in enumerate(players):
        team[idx] = features_for_player(player)
    return team.mean(axis=0)


display(
    px.line(
        team_features(
            [players_games[22799], players_games[87797], players_games[87509]]
        )
    )
)
display(
    px.line(
        team_features([players_games[14518], players_games[12770], players_games[6064]])
    )
)
In [15]:
def team_stats(players):
    if len(players) == 0:
        return (0, 0)
    if isinstance(players[0], int):
        players = [players_games[p] for p in players]
    team_corr = []
    team_incorr = []
    for player in players:
        corr, incorr = player_stats(player)
        team_corr.append(corr)
        team_incorr.append(incorr)
    return np.mean(team_corr), np.mean(team_incorr)


(
    team_stats([players_games[22799], players_games[87797], players_games[87509]]),
    team_stats([22799, 87797, 87509]),
    team_stats([players_games[14518], players_games[12770], players_games[6064]]),
)
Out[15]:
((0.45282047573708684, 0.7515771808180866),
 (0.45282047573708684, 0.7515771808180866),
 (0.4615331484659509, 0.7518183967285608))

Ресзультат

In [166]:
def compute_metrics(df):
    metric_spearman = []
    metric_kendall = []
    for tour_id, _ in df.iterrows():
        check = [
            np.mean(team_stats([p["player"]["id"] for p in t["teamMembers"]]))
            for t in results_data[tour_id]
        ]
        #     display(px.line(y=check))
        spearman = spearmanr(check, -np.arange(len(check))).correlation
        kendall = kendalltau(check, -np.arange(len(check))).correlation
        if np.isnan(spearman):
            spearman = 0
        if np.isnan(kendall):
            kendall = 0
        metric_spearman.append(spearman)
        metric_kendall.append(kendall)

    metric_spearman = np.mean(metric_spearman)
    metric_kendall = np.mean(metric_kendall)
    return metric_spearman, metric_kendall


print("Train:", compute_metrics(tournaments_df_train))
print("Test:", compute_metrics(tournaments_df_test))
Train: (0.7017189636014557, 0.542864839188076)
Test: (0.6342618710526812, 0.47879146683885)

Это было решение через статистики, которое, к сожалению, не требуется в данной задаче. Поэтому, далее надо построить матрицу для M шага.

In [17]:
players_games_df = []
for t_id in tournaments_df_train.index:
    for team in results_data[t_id]:
        mask = list(map(int, team["mask"].replace("?", "1").replace("X", "0")))
        for player in team["teamMembers"]:
            for question_idx, answer in enumerate(mask):
                players_games_df.append(
                    (
                        t_id * max_mask_len * 2 + question_idx,  # question_id
                        player["player"]["id"],  # player_id
                        team["team"]["id"],  # team_id
                        answer,  # answer
                    )
                )
players_games_df = np.array(players_games_df)
players_games_df.shape
Out[17]:
21066091

Далее делается датасет для предсказания вероятности ответа на вопрос

In [18]:
(players_games_df[0], players_games_df[100])
Out[18]:
(array([4772000,    6212,   45556,       1]),
 array([4772028,   18036,   45556,       0]))
In [23]:
question_enc = OneHotEncoder()
player_enc = OneHotEncoder()
X = hstack(
    (
        player_enc.fit_transform(players_games_df[:, 1].reshape(-1, 1)),
        question_enc.fit_transform(players_games_df[:, 0].reshape(-1, 1)),
    )
)
X.shape
(21066091, 92270)
In [24]:
baseline_model = LogisticRegression(random_state=0, class_weight="balanced", solver="liblinear")
baseline_model.fit(X, players_games_df[:, 3])
baseline_model.score(X, players_games_df[:, 3])
Out[24]:
0.7590245860041144
In [58]:
assert (
    baseline_model.coef_.shape[1]
    == player_enc.categories_[0].shape[0] + question_enc.categories_[0].shape[0]
)
In [ ]:
players_enc_idx = (
    player_enc.transform(np.array(list(players_games.keys())).reshape(-1, 1))
    .argmax(axis=1)
    .flatten()
)
players_proba = 1 / (
    1
    + np.exp(
        -baseline_model.intercept_
        + baseline_model.coef_[0, : player_enc.categories_[0].shape[0]][players_enc_idx]
    )
)
players_proba = {p_id: pp for p_id, pp in zip(players_games.keys(), players_proba[0])}
In [140]:
def team_p(team, players_proba):
    result = 1
    for player in team["teamMembers"]:
        result *= 1 - players_proba.get(player["player"]["id"], 0)
    return 1 - result
In [142]:
def compute_metrics(df, players_proba):
    metric_spearman = []
    metric_kendall = []
    for tour_id, _ in df.iterrows():
        check = [team_p(t, players_proba) for t in results_data[tour_id]]
        spearman = spearmanr(check, np.arange(len(check))).correlation
        kendall = kendalltau(check, np.arange(len(check))).correlation
        if np.isnan(spearman):
            spearman = 0
        if np.isnan(kendall):
            kendall = 0
        metric_spearman.append(spearman)
        metric_kendall.append(kendall)
        break

    metric_spearman = np.mean(metric_spearman)
    metric_kendall = np.mean(metric_kendall)
    return metric_spearman, metric_kendall


print("Train:", compute_metrics(tournaments_df_train, players_proba))
print("Test:", compute_metrics(tournaments_df_test, players_proba))
Train: (0.6665696372592925, 0.503707886316582)
Test: (0.6359447004608295, 0.48924988055422836)

Далее вводим скрытую переменную и запускаем итерации

In [150]:
eps = 1e-5


def em(p):
    for _ in range(3):
        model.fit(X, logit(p))

        players_proba = 1 / (
            1
            + np.exp(
                -model.intercept_
                + model.coef_[: player_enc.categories_[0].shape[0]][players_enc_idx]
            )
        )
        players_proba = {
            p_id: pp for p_id, pp in zip(players_games.keys(), players_proba[0])
        }

        print(compute_metrics(tournaments_df_test, players_proba))

        p = expit(model.predict(X))
        questions = defaultdict(list)
        curr_team = players_games_df[0, 2]
        prev_idx = 0
        z = []
        for idx, (r, curr_p) in enumerate(zip(players_games_df, p)):
            if r[2] != curr_team:
                for q in questions.keys():
                    questions[q] = np.prod(questions[q])
                for i in range(prev_idx, idx):
                    z.append(questions[players_games_df[i, 0]])
                questions = defaultdict(list)
                curr_team = r[2]
                prev_idx = idx
            questions[r[0]].append(1 - curr_p)
        for q in questions.keys():
            questions[q] = np.prod(questions[q])
        for i in range(prev_idx, idx + 1):
            z.append(questions[players_games_df[i, 0]])

        z = np.array(z)
        z = p / (1 - z)
        z = z * players_games_df[:, -1]
        p = np.clip(z, eps, 1 - eps)
In [165]:
model = LinearRegression()
initial_p = baseline_model.predict_proba(X)[:, 1] * players_games_df[:, -1]
em(np.clip(initial_p, eps, 1 - eps))
0.6791244522801549 0.5109023904505778
0.6825513414599562 0.5149939064122251
0.6827447215189244 0.515333255453505